# NOTE: --num-workers requires an integer argument; the value 2 below is an
# assumed placeholder (Megatron's default), adjust per host.
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
    /mnt/nasdata/megatron/pretrain_t5.py \
    --tensor-model-parallel-size 1 \
    --num-layers 16 \
    --hidden-size 1280 \
    --num-attention-heads 16 \
    --kv-channels 64 \
    --ffn-hidden-size 3072 \
    --encoder-seq-length 768 \
    --decoder-seq-length 768 \
    --max-position-embeddings 1024 \
    --micro-batch-size $BATCH_SIZE \
    --train-iters 2000000 \
    --lr 0.0001 \
    --min-lr 0.00001 \
    --lr-decay-style linear \
    --lr-decay-iters 1000000 \
    --override-lr-scheduler \
    --adam-beta1 0.9 \
    --adam-beta2 0.999 \
    --weight-decay 1e-2 \
    --clip-grad 1.0 \
    --split 949,50,1 \
    --vocab-file /mnt/nasdata/wikidata/bert-vocab.txt \
    --vocab-extra-ids 100 \
    --data-path /mnt/nasdata/megatron/my-bert_text_sentence \
    --data-impl mmap \
    --num-workers 2 \
    --log-interval 1 \
    --eval-interval 1000 \
    --eval-iters 10 \
    --log-optimizer-states-to-tensorboard \
    --deepspeed \
    --deepspeed_config /mnt/nasdata/megatron/examples/t5_with_pile/ds_offload_t5.json \
    --zero-stage 2 \
    --no-pipeline-parallel \
    --cpu-optimizer
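
# The command above assumes $DISTRIBUTED_ARGS and $BATCH_SIZE are set earlier
# in the script. A minimal single-node sketch, using the standard
# torch.distributed.launch flags (values here are illustrative assumptions,
# not taken from the original setup):
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6000
BATCH_SIZE=4

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
                  --nnodes $NNODES \
                  --node_rank $NODE_RANK \
                  --master_addr $MASTER_ADDR \
                  --master_port $MASTER_PORT"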
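
# With --zero-stage 2 and --cpu-optimizer, the DeepSpeed config referenced by
# --deepspeed_config is expected to enable ZeRO stage 2 with optimizer state
# offload to CPU. A minimal sketch of what ds_offload_t5.json might contain
# (assumed contents, not the original file; train_micro_batch_size_per_gpu
# must agree with --micro-batch-size, i.e. $BATCH_SIZE):
cat > /mnt/nasdata/megatron/examples/t5_with_pile/ds_offload_t5.json <<'EOF'
{
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 1,
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    }
  },
  "fp16": {
    "enabled": true
  }
}
EOF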